{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SQL functions\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SparkContext and SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "sc = SparkContext(master = 'local')\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder \\\n",
    "          .appName(\"Python Spark SQL basic example\") \\\n",
    "          .config(\"spark.some.config.option\", \"some-value\") \\\n",
    "          .getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+------------+-----------+-------+\n",
      "|sepal_length|sepal_width|petal_length|petal_width|species|\n",
      "+------------+-----------+------------+-----------+-------+\n",
      "|         5.1|        3.5|         1.4|        0.2| setosa|\n",
      "|         4.9|        3.0|         1.4|        0.2| setosa|\n",
      "|         4.7|        3.2|         1.3|        0.2| setosa|\n",
      "|         4.6|        3.1|         1.5|        0.2| setosa|\n",
      "|         5.0|        3.6|         1.4|        0.2| setosa|\n",
      "+------------+-----------+------------+-----------+-------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "iris = spark.read.csv('data/iris.csv', header=True, inferSchema=True)\n",
    "iris.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|\n",
      "|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|\n",
      "|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate = spark.read.csv('data/prostate.csv', header=True, inferSchema=True)\n",
    "prostate.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import *\n",
    "from pyspark.sql.types import *\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `abs`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+\n",
      "|        lpsa|  abs(lpsa)|\n",
      "+------------+-----------+\n",
      "|-0.430782916|0.430782916|\n",
      "|-0.162518929|0.162518929|\n",
      "|-0.162518929|0.162518929|\n",
      "|-0.162518929|0.162518929|\n",
      "| 0.371563556|0.371563556|\n",
      "+------------+-----------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select('lpsa', abs(prostate.lpsa).alias('abs(lpsa)')).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `acos`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|                   x|\n",
      "+--------------------+\n",
      "| -0.9338356359616288|\n",
      "|-0.17825806390480148|\n",
      "| -0.8287101229670288|\n",
      "| -0.9203268470931772|\n",
      "| -0.5717064564704842|\n",
      "+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "pdf = pd.DataFrame({\n",
    "    'x': list(-np.random.rand(5)) + list(np.random.rand(5))\n",
    "})\n",
    "df = spark.createDataFrame(pdf)\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+------------------+\n",
      "|                   x|           ACOS(x)|\n",
      "+--------------------+------------------+\n",
      "| -0.9338356359616288| 2.775786316206805|\n",
      "|-0.17825806390480148|1.7500122036992714|\n",
      "| -0.8287101229670288|2.5475953864778313|\n",
      "| -0.9203268470931772|2.7397115973218678|\n",
      "| -0.5717064564704842|2.1793805606775827|\n",
      "+--------------------+------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('x', acos(df.x)).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `add_months`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+\n",
      "|     dates|\n",
      "+----------+\n",
      "|2017-06-25|\n",
      "|2017-06-26|\n",
      "|2017-06-27|\n",
      "|2017-06-28|\n",
      "|2017-06-29|\n",
      "+----------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "base = datetime.date.today()\n",
    "date_list = [base + datetime.timedelta(days=x) for x in list(range(0, 10))*10]\n",
    "pdf = pd.DataFrame({\n",
    "    'dates': date_list\n",
    "})\n",
    "df = spark.createDataFrame(pdf)\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+----------+\n",
      "|     dates| new_dates|\n",
      "+----------+----------+\n",
      "|2017-06-25|2017-08-25|\n",
      "|2017-06-26|2017-08-26|\n",
      "|2017-06-27|2017-08-27|\n",
      "|2017-06-28|2017-08-28|\n",
      "|2017-06-29|2017-08-29|\n",
      "+----------+----------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('dates', add_months(df.dates, 2).alias('new_dates')).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `approx_count_distinct`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+\n",
      "|approx_count_distinct(gleason)|\n",
      "+------------------------------+\n",
      "|                             4|\n",
      "+------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(approx_count_distinct(prostate.gleason)).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+\n",
      "|approx_count_distinct(species)|\n",
      "+------------------------------+\n",
      "|                             3|\n",
      "+------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "iris.select(approx_count_distinct(iris.species)).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `array`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+------------+-----------+-------+\n",
      "|sepal_length|sepal_width|petal_length|petal_width|species|\n",
      "+------------+-----------+------------+-----------+-------+\n",
      "|         5.1|        3.5|         1.4|        0.2| setosa|\n",
      "|         4.9|        3.0|         1.4|        0.2| setosa|\n",
      "|         4.7|        3.2|         1.3|        0.2| setosa|\n",
      "|         4.6|        3.1|         1.5|        0.2| setosa|\n",
      "|         5.0|        3.6|         1.4|        0.2| setosa|\n",
      "+------------+-----------+------------+-----------+-------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "iris.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------+\n",
      "|species|            features|\n",
      "+-------+--------------------+\n",
      "| setosa|[5.1, 3.5, 1.4, 0.2]|\n",
      "| setosa|[4.9, 3.0, 1.4, 0.2]|\n",
      "| setosa|[4.7, 3.2, 1.3, 0.2]|\n",
      "| setosa|[4.6, 3.1, 1.5, 0.2]|\n",
      "| setosa|[5.0, 3.6, 1.4, 0.2]|\n",
      "+-------+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_arr = iris.select('species', array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width']).alias('features'))\n",
    "df_arr.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `array_contains`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------+------------+\n",
      "|species|            features|new_features|\n",
      "+-------+--------------------+------------+\n",
      "| setosa|[5.1, 3.5, 1.4, 0.2]|        true|\n",
      "| setosa|[4.9, 3.0, 1.4, 0.2]|        true|\n",
      "| setosa|[4.7, 3.2, 1.3, 0.2]|       false|\n",
      "| setosa|[4.6, 3.1, 1.5, 0.2]|       false|\n",
      "| setosa|[5.0, 3.6, 1.4, 0.2]|        true|\n",
      "+-------+--------------------+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = df_arr.select('species', 'features', array_contains(df_arr.features, 1.4).alias('new_features'))\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------+------------+\n",
      "|species|            features|new_features|\n",
      "+-------+--------------------+------------+\n",
      "| setosa|[5.1, 3.5, 1.4, 0.2]|        true|\n",
      "| setosa|[4.9, 3.0, 1.4, 0.2]|        true|\n",
      "| setosa|[5.0, 3.6, 1.4, 0.2]|        true|\n",
      "| setosa|[4.6, 3.4, 1.4, 0.3]|        true|\n",
      "| setosa|[4.4, 2.9, 1.4, 0.2]|        true|\n",
      "+-------+--------------------+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.filter(df.new_features).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `asc`\n",
    "\n",
    "`asc` returns a **sort expression**, which can be used as argument of sort functions such as `pyspark.sql.DataFrame.sort` and `pyspark.sql.DataFrame.orderBy`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|\n",
      "|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|\n",
      "| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.sort(prostate.lpsa.asc()).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|\n",
      "|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|\n",
      "| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.orderBy(prostate.lpsa.asc()).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `ascii`\n",
    "* `asin`\n",
    "* `atan`\n",
    "* `atan2`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `avg`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|         avg(lpsa)|\n",
      "+------------------+\n",
      "|2.4783868787422683|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(avg(prostate.lpsa)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `base64`\n",
    "* `bin`\n",
    "* `bitwiseNOT`\n",
    "* `broadcast`\n",
    "* `bround`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `cbrt`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-------------------+\n",
      "|        lpsa|         CBRT(lpsa)|\n",
      "+------------+-------------------+\n",
      "|-0.430782916|-0.7552420410177275|\n",
      "|-0.162518929|-0.5457176294010901|\n",
      "|-0.162518929|-0.5457176294010901|\n",
      "|-0.162518929|-0.5457176294010901|\n",
      "| 0.371563556| 0.7189152621521183|\n",
      "+------------+-------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select('lpsa', cbrt(prostate.lpsa)).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `ceil`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+----------+\n",
      "|        lpsa|CEIL(lpsa)|\n",
      "+------------+----------+\n",
      "|-0.430782916|         0|\n",
      "|-0.162518929|         0|\n",
      "|-0.162518929|         0|\n",
      "|-0.162518929|         0|\n",
      "| 0.371563556|         1|\n",
      "+------------+----------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select('lpsa', ceil(prostate.lpsa)).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `coalesce`\n",
    "\n",
    "Return the first column that is not null."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+----+\n",
      "|   a|   b|\n",
      "+----+----+\n",
      "|null|null|\n",
      "|   1|null|\n",
      "|null|   2|\n",
      "+----+----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = spark.createDataFrame([(None, None), (1, None), (None, 2)], (\"a\", \"b\"))\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+\n",
      "|coalesce(a, b)|\n",
      "+--------------+\n",
      "|          null|\n",
      "|             1|\n",
      "|             2|\n",
      "+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(coalesce(df.a, df.b)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `col`\n",
    "\n",
    "Returns a **Column** based on the given column name. It can save your some typing when the dataframe is very long."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|\n",
      "|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|\n",
      "|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+---+\n",
      "|      lcavol|age|\n",
      "+------------+---+\n",
      "|-0.579818495| 50|\n",
      "|-0.994252273| 58|\n",
      "|-0.510825624| 74|\n",
      "|-1.203972804| 58|\n",
      "| 0.751416089| 62|\n",
      "+------------+---+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(col('lcavol'), col('age')).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `collect_list`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+\n",
      "|  x|\n",
      "+---+\n",
      "|  1|\n",
      "|  2|\n",
      "|  2|\n",
      "|  3|\n",
      "|  4|\n",
      "|  4|\n",
      "|  4|\n",
      "|  4|\n",
      "+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "pdf = pd.DataFrame({\n",
    "    'x':[1, 2, 2, 3, 4,4,4,4]\n",
    "})\n",
    "df = spark.createDataFrame(pdf)\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|     collect_list(x)|\n",
      "+--------------------+\n",
      "|[1, 2, 2, 3, 4, 4...|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(collect_list(df.x)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `collect_set`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+\n",
      "|collect_set(x)|\n",
      "+--------------+\n",
      "|  [1, 2, 3, 4]|\n",
      "+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(collect_set(df.x)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `concat`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+\n",
      "|  x|  v|\n",
      "+---+---+\n",
      "|  a|  1|\n",
      "|  b|  2|\n",
      "+---+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = spark.createDataFrame([['a', '1'], ['b', '2']], ['x', 'v'])\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+------------+\n",
      "|  x|  v|concate(x,v)|\n",
      "+---+---+------------+\n",
      "|  a|  1|          a1|\n",
      "|  b|  2|          b2|\n",
      "+---+---+------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('x', 'v', concat(df.x, df.v).alias('concate(x,v)')).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `concat_ws`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+------------+\n",
      "|  x|  v|concate(x,v)|\n",
      "+---+---+------------+\n",
      "|  a|  1|         a_1|\n",
      "|  b|  2|         b_2|\n",
      "+---+---+------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('x', 'v', concat_ws('_', df.x, df.v).alias('concate(x,v)')).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `conv`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `corr`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|\n",
      "|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|\n",
      "|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|\n",
      "| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|\n",
      "+------------+-----------+---+------------+---+------------+-------+-----+------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+\n",
      "|    corr(age, lpsa)|\n",
      "+-------------------+\n",
      "|0.16959284228582772|\n",
      "+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(corr(prostate.age, prostate.lpsa)).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `cos`\n",
    "## `cosh`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `count`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+\n",
      "|count(lpsa)|\n",
      "+-----------+\n",
      "|         97|\n",
      "+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(count(prostate.lpsa)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `countDistinct`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+\n",
      "|count(species)|\n",
      "+--------------+\n",
      "|           150|\n",
      "+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "iris.select(count(iris.species)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `covar_pop`\n",
    "\n",
    "**population covariance**: $\\frac{1}{n}\\sum_{i=1}^n(x_{i} - \\bar{x})(y_{i} - \\bar{y})$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|covar_pop(age, lpsa)|\n",
      "+--------------------+\n",
      "|  1.4424746293984458|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(covar_pop(prostate.age, prostate.lpsa)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `covar_samp`\n",
    "**sample covariance**: $\\frac{1}{n-1}\\sum_{i=1}^n(x_{i} - \\bar{x})(y_{i} - \\bar{y})$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------------+\n",
      "|covar_samp(age, lpsa)|\n",
      "+---------------------+\n",
      "|   1.4575004067880128|\n",
      "+---------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "prostate.select(covar_samp(prostate.age, prostate.lpsa)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `create_map`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+-----------+------------+-----------+-------+\n",
      "|sepal_length|sepal_width|petal_length|petal_width|species|\n",
      "+------------+-----------+------------+-----------+-------+\n",
      "|         5.1|        3.5|         1.4|        0.2| setosa|\n",
      "|         4.9|        3.0|         1.4|        0.2| setosa|\n",
      "|         4.7|        3.2|         1.3|        0.2| setosa|\n",
      "|         4.6|        3.1|         1.5|        0.2| setosa|\n",
      "|         5.0|        3.6|         1.4|        0.2| setosa|\n",
      "+------------+-----------+------------+-----------+-------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "iris.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------------+\n",
      "|map(species, sepal_length)|\n",
      "+--------------------------+\n",
      "|        Map(setosa -> 5.1)|\n",
      "|        Map(setosa -> 4.9)|\n",
      "|        Map(setosa -> 4.7)|\n",
      "|        Map(setosa -> 4.6)|\n",
      "|        Map(setosa -> 5.0)|\n",
      "+--------------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = iris.select(create_map('species', 'sepal_length'))\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('map(species, sepal_length)', 'map<string,double>')]"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## `cume_dist`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `current_date`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+\n",
      "|  x|\n",
      "+---+\n",
      "|  1|\n",
      "|  2|\n",
      "|  3|\n",
      "|  4|\n",
      "+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = spark.createDataFrame([[1],[2],[3],[4]], ['x'])\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------+\n",
      "|  x|current_date()|\n",
      "+---+--------------+\n",
      "|  1|    2017-06-27|\n",
      "|  2|    2017-06-27|\n",
      "|  3|    2017-06-27|\n",
      "|  4|    2017-06-27|\n",
      "+---+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('x', current_date()).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `current_tmestamp`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----------------------+\n",
      "|x  |current_timestamp()    |\n",
      "+---+-----------------------+\n",
      "|1  |2017-06-27 01:18:48.383|\n",
      "|2  |2017-06-27 01:18:48.383|\n",
      "|3  |2017-06-27 01:18:48.383|\n",
      "|4  |2017-06-27 01:18:48.383|\n",
      "+---+-----------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select('x', current_timestamp()).show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `date_add`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------------+\n",
      "|  x|current_date|\n",
      "+---+------------+\n",
      "|  1|  2017-06-27|\n",
      "|  2|  2017-06-27|\n",
      "|  3|  2017-06-27|\n",
      "|  4|  2017-06-27|\n",
      "+---+------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df2 = df.select('x', current_date().alias('current_date'))\n",
    "df2.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------------+--------------------------+\n",
      "|  x|current_date|date_add(current_date, 10)|\n",
      "+---+------------+--------------------------+\n",
      "|  1|  2017-06-27|                2017-07-07|\n",
      "|  2|  2017-06-27|                2017-07-07|\n",
      "|  3|  2017-06-27|                2017-07-07|\n",
      "|  4|  2017-06-27|                2017-07-07|\n",
      "+---+------------+--------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df2.select('x', 'current_date', date_add(df2.current_date, 10)).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `date_format`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------------+----------+\n",
      "|  x|current_date|  new_date|\n",
      "+---+------------+----------+\n",
      "|  1|  2017-06-27|06/27/2017|\n",
      "|  2|  2017-06-27|06/27/2017|\n",
      "|  3|  2017-06-27|06/27/2017|\n",
      "|  4|  2017-06-27|06/27/2017|\n",
      "+---+------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df2.select('x', 'current_date', date_format('current_date', 'MM/dd/yyyy').alias('new_date')).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
